{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/env ython\n",
    "\n",
    "from datetime import datetime\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import sys\n",
    "import os\n",
    "\n",
    "# __file__ = '1code/format_links.py'\n",
    "# ROOT   = Path('~/Dropbox/D67').resolve()\n",
    "#ROOT   = Path(__file__).resolve().parents[1]\n",
    "ROOT   = Path('C:/Users/xiy333/Dropbox/IGNITE/2input_data/D67').resolve()\n",
    "OUT    = (ROOT / '3ROAs').resolve()\n",
    "CASES  = ROOT / '2case_number/current_inmate_ROA.xlsx'\n",
    "\n",
    "\n",
    "def main(cases, outfolder):\n",
    "    \"\"\"\n",
    "    Scrape ROAs from the 7th circuit. Takes as input\n",
    "\n",
    "        ../2case_number/case_number_MB.xlsx\n",
    "\n",
    "    with a list of case numbers. Outputs\n",
    "\n",
    "        ../3ROAs/*.html\n",
    "\n",
    "    with scraped web pages (file names are case #). Depends on\n",
    "\n",
    "        undetected_chromedriver\n",
    "        selenium\n",
    "        pandas\n",
    "        bs4\n",
    "\n",
    "    as well as having Chrome/Chromium installed.\n",
    "    \"\"\"\n",
    "\n",
    "    print(sys.version)\n",
    "    print(sys.platform if 'win' in sys.platform else os.linesep.join(os.uname()))\n",
    "\n",
    "    datestr = datetime.today().strftime(\"%H:%M %a %b %d, %Y\")\n",
    "    print(os.linesep.join(['-' * 72, datestr]))\n",
    "\n",
    "    outfolder = Path(outfolder)\n",
    "    outfolder.mkdir(parents=True, exist_ok=True)\n",
    "    df = load_cases(cases).sort_values('case_number').query(\"queryurl != ''\").drop_duplicates()\n",
    "    \n",
    "    with open(cases.parent / 'queryurl_current.txt', 'w') as txt:\n",
    "        txt.writelines('\\n'.join(df.queryurl.values))\n",
    "\n",
    "    with open(cases.parent / 'querycase_current.txt', 'w') as txt:\n",
    "        txt.writelines('\\n'.join(df.querycase.values))\n",
    "    ''' \n",
    "    df.to_excel(cases.parent / 'crosswalk_08feb2023.xlsx', index=False)\n",
    "    '''\n",
    "    isdone = []\n",
    "    errors = []\n",
    "    isnot  = []\n",
    "    errstr = \"External dependency responded with\"\n",
    "    for queryurl, querycase in df[['queryurl', 'querycase']].values:\n",
    "        outfile  = OUT / (querycase + '.html')\n",
    "        if outfile.is_file():\n",
    "            with open(outfile, 'r') as html:\n",
    "                htmlread = html.read()\n",
    "\n",
    "            if 'Error!' in htmlread and errstr in htmlread:\n",
    "                errors += [querycase]\n",
    "            elif querycase in htmlread:\n",
    "                isdone += [querycase]\n",
    "        else:\n",
    "            isnot += [querycase]\n",
    "\n",
    "    print(f\"{len(isdone)} files scraped.\")\n",
    "    print(f\"{len(errors)} files with error.\")\n",
    "    print(f\"{len(isnot)} files pending.\")\n",
    "\n",
    "    datestr = datetime.today().strftime(\"%H:%M %a %b %d, %Y\")\n",
    "    print(os.linesep.join(['-' * 72, datestr]))\n",
    "\n",
    "\n",
    "def load_cases(cases):\n",
    "    \"\"\"\n",
    "    Load case list and pre-format query urls; I was only able to find\n",
    "    case numbers matching the regex below; about 1/5 of cases dropped.\n",
    "    \"\"\"\n",
    "\n",
    "    pattern = r'^.?(\\d{2,2}).+'\n",
    "    search  = \"\"\"\n",
    "        https://micourt.courts.michigan.gov/case-search/court/D67/\n",
    "        case-details?\n",
    "        caseId={case_year}-{case_number}-{case_type}\n",
    "        -01&tenantKey=D67-25-0626170-{tenantType}-00\n",
    "        &searchUrl=%2Fcourt%2FD67%2Fsearch%3FfirstName%3D%26middleName%3D%26lastName%3D%26birthYear%3D0\n",
    "        %26caseNumber%3D{case_number}\n",
    "        %26caseYear%3D{case_year}%26caseType%3D{case_type}%26page%3D1\n",
    "    \"\"\".strip().replace('\\r', '').replace('\\n', '').replace(' ', '').strip()\n",
    "\n",
    "    #df1 = pd.read_excel(cases, sheet_name='Sheet1').iloc[:, :2].dropna()\n",
    "    #df2 = pd.read_excel(cases, sheet_name='Sheet2').iloc[:, :2].dropna()\n",
    "    #df1.columns = ['case_number', 'case_type']\n",
    "    #df2.columns = ['case_number', 'case_type']\n",
    "    #df = pd.concat((df1, df2), axis=0)\n",
    "    df = pd.read_excel(cases, sheet_name='Sheet1').iloc[:, :2].dropna()\n",
    "    df.columns = ['case_number', 'case_type']\n",
    "    df['case_year'] = df.case_number.str.extract(pattern)\n",
    "    df.dropna(inplace=True)\n",
    "    df['case_year']  = 2000 + df['case_year'].astype(int)\n",
    "    # df['tenantType'] = df['case_type'].map(tenantType)\n",
    "    df = df[df['case_year']>2015]\n",
    "    df = df[df['case_year']<=2022]\n",
    "    df['tenantType'] = '00'\n",
    "    df.loc[df['case_type']=='FY', 'tenantType'] = '00'\n",
    "    df['queryurl']   = ['' if pd.isnull(row['tenantType']) else search.format(**row.to_dict()) for _, row in df.iterrows()]\n",
    "    df['querycase']  = ['{case_year}-{case_number}-{case_type}'.format(**row.to_dict()) for _, row in df.iterrows()]\n",
    "    return df\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main(CASES, OUT)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/env ython\n",
    "\n",
    "from datetime import datetime\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import sys\n",
    "import os\n",
    "\n",
    "# __file__ = '1code/format_links.py'\n",
    "# ROOT   = Path('~/Dropbox/D67').resolve()\n",
    "#ROOT   = Path(__file__).resolve().parents[1]\n",
    "ROOT   = Path('C:/Users/xiy333/Dropbox/IGNITE/2input_data/D67').resolve()\n",
    "OUT    = (ROOT / '3ROAs').resolve()\n",
    "CASES  = ROOT / '2case_number/dist_case_num_OM_SM_06feb2023.xlsx'\n",
    "\n",
    "\n",
    "def main(cases, outfolder):\n",
    "    \"\"\"\n",
    "    Scrape ROAs from the 7th circuit. Takes as input\n",
    "\n",
    "        ../2case_number/case_number_MB.xlsx\n",
    "\n",
    "    with a list of case numbers. Outputs\n",
    "\n",
    "        ../3ROAs/*.html\n",
    "\n",
    "    with scraped web pages (file names are case #). Depends on\n",
    "\n",
    "        undetected_chromedriver\n",
    "        selenium\n",
    "        pandas\n",
    "        bs4\n",
    "\n",
    "    as well as having Chrome/Chromium installed.\n",
    "    \"\"\"\n",
    "\n",
    "    print(sys.version)\n",
    "    print(sys.platform if 'win' in sys.platform else os.linesep.join(os.uname()))\n",
    "\n",
    "    datestr = datetime.today().strftime(\"%H:%M %a %b %d, %Y\")\n",
    "    print(os.linesep.join(['-' * 72, datestr]))\n",
    "\n",
    "    outfolder = Path(outfolder)\n",
    "    outfolder.mkdir(parents=True, exist_ok=True)\n",
    "    df = load_cases(cases).sort_values('case_number').query(\"queryurl != ''\").drop_duplicates()\n",
    "    with open(cases.parent / 'queryurl_OM_SM_recatch.txt', 'w') as txt:\n",
    "        txt.writelines('\\n'.join(df.queryurl.values))\n",
    "\n",
    "    with open(cases.parent / 'querycase_OM_SM_recatch.txt', 'w') as txt:\n",
    "        txt.writelines('\\n'.join(df.querycase.values))\n",
    "\n",
    "    isdone = []\n",
    "    errors = []\n",
    "    isnot  = []\n",
    "    errstr = \"External dependency responded with\"\n",
    "    for queryurl, querycase in df[['queryurl', 'querycase']].values:\n",
    "        outfile  = OUT / (querycase + '.html')\n",
    "        if outfile.is_file():\n",
    "            with open(outfile, 'r') as html:\n",
    "                htmlread = html.read()\n",
    "\n",
    "            if 'Error!' in htmlread and errstr in htmlread:\n",
    "                errors += [querycase]\n",
    "            elif querycase in htmlread:\n",
    "                isdone += [querycase]\n",
    "        else:\n",
    "            isnot += [querycase]\n",
    "\n",
    "    print(f\"{len(isdone)} files scraped.\")\n",
    "    print(f\"{len(errors)} files with error.\")\n",
    "    print(f\"{len(isnot)} files pending.\")\n",
    "\n",
    "    datestr = datetime.today().strftime(\"%H:%M %a %b %d, %Y\")\n",
    "    print(os.linesep.join(['-' * 72, datestr]))\n",
    "\n",
    "\n",
    "def load_cases(cases):\n",
    "    \"\"\"\n",
    "    Load case list and pre-format query urls; I was only able to find\n",
    "    case numbers matching the regex below; about 1/5 of cases dropped.\n",
    "    \"\"\"\n",
    "\n",
    "    pattern = r'^.?(\\d{2,2}).+'\n",
    "    search  = \"\"\"\n",
    "        https://micourt.courts.michigan.gov/case-search/court/D67/\n",
    "        case-details?\n",
    "        caseId={case_year}-{case_number}-{case_type}\n",
    "        -01&tenantKey=D67-25-0626170-01-00\n",
    "        &searchUrl=%2Fcourt%2FD67%2Fsearch%3FfirstName%3D%26middleName%3D%26lastName%3D%26birthYear%3D0\n",
    "        %26caseNumber%3D{case_number}\n",
    "        %26caseYear%3D{case_year}%26caseType%3D{case_type}%26page%3D1\n",
    "    \"\"\".strip().replace('\\r', '').replace('\\n', '').replace(' ', '').strip()\n",
    "\n",
    "    #df1 = pd.read_excel(cases, sheet_name='Sheet1').iloc[:, :2].dropna()\n",
    "    #df2 = pd.read_excel(cases, sheet_name='Sheet2').iloc[:, :2].dropna()\n",
    "    #df1.columns = ['case_number', 'case_type']\n",
    "    #df2.columns = ['case_number', 'case_type']\n",
    "    #df = pd.concat((df1, df2), axis=0)\n",
    "    df = pd.read_excel(cases, sheet_name='Sheet1').iloc[:, :2].dropna()\n",
    "    df.columns = ['case_number', 'case_type']\n",
    "    df['case_year'] = df.case_number.str.extract(pattern)\n",
    "    df.dropna(inplace=True)\n",
    "    df['case_year']  = 2000 + df['case_year'].astype(int)\n",
    "    df = df[df['case_year']>=2020]\n",
    "    df = df[df['case_year']<=2023]\n",
    "    # df['tenantType'] = df['case_type'].map(tenantType)\n",
    "    df['tenantType'] = '01'\n",
    "    df['case_number'] = df.case_number.str.replace(\"-\", \"0\" )\n",
    "    df['queryurl']   = ['' if pd.isnull(row['tenantType']) else search.format(**row.to_dict()) for _, row in df.iterrows()]\n",
    "    df['querycase']  = ['{case_year}-{case_number}-{case_type}'.format(**row.to_dict()) for _, row in df.iterrows()]\n",
    "    return df\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main(CASES, OUT)\n",
    "\n",
    "\n",
    "#Enter script code\n",
    "\n",
    "\"\"\"REQUIRES MANUAL INPUT \n",
    "0. For Linux: uncomment all and remove pyautogui/pyperclip, run on autokey\n",
    "\n",
    "1. Run first block of the notebook\n",
    "\n",
    "2. Search (using Firefox! Need to modify to use other browser) for any case in \n",
    "\n",
    "    https://micourt.courts.michigan.gov/case-search/court/D67/\n",
    "\n",
    "    to trigger the ToS/Captcha at least once.\n",
    "\n",
    "3. Open the developer console via Ctrl+Shift+I and switch to the console\n",
    "   tab, then close.\n",
    "\n",
    "4. Change your downloads folder to\n",
    "\n",
    "    /path/to/D67/3ROAs\n",
    "\n",
    "5. Change ROOT to /path/to/D67\n",
    "\n",
    "6. Run this script with autokey; be sure to use the same browser window\n",
    "\n",
    "7. Repeat as needed (took me a few hours and several restarts; you should\n",
    "   do the captchas when they appear, but otherwise babysitting this script is\n",
    "   pretty passive).\n",
    "\"\"\"\n",
    "\n",
    "from pathlib import Path\n",
    "import pyautogui\n",
    "import time\n",
    "import pyperclip\n",
    "\n",
    "cached     = True\n",
    "shortdelay = 0.2\n",
    "longdelay  = 5\n",
    "retrydelay = 30\n",
    "maxretries = 1\n",
    "\n",
    "ROOT = Path('C:/Users/xiy333/Dropbox/IGNITE/2input_data/D67')\n",
    "OUT  = (ROOT / '3ROAs').resolve()\n",
    "\n",
    "javascript = \"\"\"\n",
    "function download(filename, text) {\n",
    "    var pom = document.createElement('a');\n",
    "    pom.setAttribute('href', 'data:text/plain;charset=utf-8,' + encodeURIComponent(text));\n",
    "    pom.setAttribute('download', filename);\n",
    "    if (document.createEvent) {\n",
    "        var event = document.createEvent('MouseEvents');\n",
    "        event.initEvent('click', true, true);\n",
    "        pom.dispatchEvent(event);\n",
    "    }\n",
    "    else {\n",
    "        pom.click();\n",
    "    }\n",
    "}\n",
    "\n",
    "function clickExpand() {\n",
    "    var buttons = document.getElementsByTagName('button')\n",
    "    for(var i = 0; i < buttons.length; i++) {\n",
    "        if (buttons[i].getAttribute('aria-expanded') == 'false') {\n",
    "            buttons[i].click();\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\"\"\"\n",
    "\n",
    "def gotopage(queryurl):\n",
    "    time.sleep(3 * shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+l<backspace>\")\n",
    "    #address bar (MAC)\n",
    "    pyautogui.hotkey('ctrl', 'l', 'backspace')\n",
    "    time.sleep(shortdelay)\n",
    "\n",
    "    #clipboard.fill_clipboard(queryurl)\n",
    "    pyperclip.copy(queryurl)\n",
    "    time.sleep(shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+v\")\n",
    "    pyautogui.hotkey('ctrl', 'v')\n",
    "    time.sleep(shortdelay)\n",
    "    #keyboard.send_keys(\"<enter>\")\n",
    "    pyautogui.press('enter')\n",
    "    time.sleep(shortdelay)\n",
    "    \n",
    "    \n",
    "def gotopage_retry(queryurl):\n",
    "    time.sleep(3 * shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+l<backspace>\")\n",
    "    #address bar (MAC)\n",
    "    queryurl = queryurl.replace('tenantKey=D67-25-0626170-00-00','tenantKey=D67-25-0626170-01-00')\n",
    "    pyautogui.hotkey('ctrl', 'l', 'backspace')\n",
    "    time.sleep(shortdelay)\n",
    "\n",
    "    #clipboard.fill_clipboard(queryurl)\n",
    "    pyperclip.copy(queryurl)\n",
    "    time.sleep(shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+v\")\n",
    "    pyautogui.hotkey('ctrl', 'v')\n",
    "    time.sleep(shortdelay)\n",
    "    #keyboard.send_keys(\"<enter>\")\n",
    "    pyautogui.press('enter')\n",
    "    time.sleep(shortdelay)\n",
    "\n",
    "def download(querycase):\n",
    "    #time.sleep(shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+<shift>+i\")\n",
    "    #opening devtools in browser\n",
    "    pyautogui.hotkey('ctrl', 'shift', 'i')\n",
    "    time.sleep(5 * shortdelay)\n",
    "\n",
    "    #clipboard.fill_clipboard(javascript)\n",
    "    pyperclip.copy(javascript)\n",
    "    time.sleep(2 * shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+v\")\n",
    "    pyautogui.hotkey('ctrl', 'v')\n",
    "    time.sleep(2 * shortdelay)\n",
    "    #keyboard.send_keys(\"<enter>\")\n",
    "    pyautogui.press('enter')\n",
    "    \n",
    "    #clipboard.fill_clipboard(\"setTimeout(function() { clickExpand(); }, 1000);\")\n",
    "    pyperclip.copy(\"setTimeout(function() { clickExpand(); }, 1000);\")\n",
    "    time.sleep(shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+v<enter>\")\n",
    "    pyautogui.hotkey('ctrl', 'v', 'enter')\n",
    "    \n",
    "    time.sleep(1)\n",
    "\n",
    "    #clipboard.fill_clipboard(f'download(\"{querycase}.html\", document.documentElement.innerHTML);')\n",
    "    pyperclip.copy(f'download(\"{querycase}.html\", document.documentElement.innerHTML);')\n",
    "    \n",
    "    time.sleep(shortdelay)\n",
    "    #keyboard.send_keys(\"<ctrl>+v<enter>\")\n",
    "    pyautogui.hotkey('ctrl', 'v', 'enter')\n",
    "    time.sleep(1)\n",
    "\n",
    "    #keyboard.send_keys(\"<ctrl>+<shift>+i\")\n",
    "    pyautogui.hotkey('ctrl', 'shift', 'i')\n",
    "    time.sleep(3 * shortdelay)\n",
    "\n",
    "with open(ROOT / '2case_number/queryurl_09mar.txt', 'r') as txt:\n",
    "    queryurls = [x.strip() for x in txt.readlines()]\n",
    "with open(ROOT / '2case_number/querycase_09mar.txt', 'r') as txt:\n",
    "    querycases = [x.strip() for x in txt.readlines()]\n",
    "\n",
    "if cached:\n",
    "    queryzip = [(queryurl, querycase) for queryurl, querycase in zip(queryurls, querycases) if not (OUT / (querycase + '.html')).is_file()]\n",
    "else:\n",
    "    queryzip = list(zip(queryurls, querycases))\n",
    "\n",
    "#dialog.info_dialog(\"Please select browser window\", \n",
    "#                   f\"Please select browser window; {len(queryzip)} queries\")\n",
    "\n",
    "pyautogui.alert(f\"Please select browser window; {len(queryzip)} queries\",\n",
    "               \"Please select browser window\")\n",
    "\n",
    "time.sleep(shortdelay)\n",
    "\n",
    "ntries = 0\n",
    "nrequests = 0\n",
    "vpn_switch = 0\n",
    "errstr = \"External dependency responded with\"\n",
    "# errstr = \"Response status code does not indicate success\"\n",
    "for queryurl, querycase in queryzip:\n",
    "    delete   = False\n",
    "    retrynow = False\n",
    "    outfile  = OUT / (querycase + '.html')\n",
    "    if cached and outfile.is_file():\n",
    "        continue\n",
    "\n",
    "    gotopage(queryurl)\n",
    "    nrequests += 1\n",
    "    time.sleep(longdelay)\n",
    "    download(querycase)\n",
    "    if not outfile.is_file():\n",
    "        ntries += 1\n",
    "        time.sleep(retrydelay)\n",
    "    else:\n",
    "        with open(outfile, 'r') as html:\n",
    "            htmlread = html.read()\n",
    "\n",
    "        if 'Error!' in htmlread and errstr in htmlread:\n",
    "            retrynow = True\n",
    "            delete   = True\n",
    "            ntries  += 1\n",
    "            time.sleep(retrydelay)  # True?\n",
    "        elif querycase not in htmlread: \n",
    "            retrynow = True\n",
    "            delete   = True\n",
    "            ntries  += 1\n",
    "            time.sleep(retrydelay)\n",
    "\n",
    "        if delete:\n",
    "            outfile.unlink()\n",
    "            delete = False\n",
    "\n",
    "        if retrynow:\n",
    "            gotopage_retry(queryurl)\n",
    "            time.sleep(longdelay)\n",
    "            download(querycase)\n",
    "            with open(outfile, 'r') as html:\n",
    "                htmlread = html.read()\n",
    "\n",
    "            if querycase not in htmlread: \n",
    "                outfile.unlink()\n",
    "    '''if nrequests % 19 == 0 and nrequests>10:\n",
    "        vpn_switch += 1\n",
    "        \n",
    "        pyautogui.getWindowsWithTitle('ExpressVPN')[0].maximize()\n",
    "        pyautogui.getWindowsWithTitle('ExpressVPN')[0].activate()\n",
    "\n",
    "        pyautogui.press('enter')\n",
    "\n",
    "        time.sleep(1)\n",
    "\n",
    "        pyautogui.press('tab')\n",
    "        pyautogui.press('tab')\n",
    "        pyautogui.press('tab')\n",
    "        \n",
    "        for i in range(vpn_switch):\n",
    "            pyautogui.press('down')\n",
    "\n",
    "        pyautogui.press('enter')\n",
    "        time.sleep(longdelay)\n",
    "        #pyautogui.getWindowsWithTitle('MiCOURT Case Search')[0].maximize()\n",
    "        pyautogui.getWindowsWithTitle('MiCOURT Case Search')[0].activate()\n",
    "    '''\n",
    "    if ntries > maxretries:\n",
    "        #break\n",
    "        time.sleep(18)\n",
    "        continue\n",
    "        \n",
    "    time.sleep(18)\n",
    "    \n",
    "time.sleep(shortdelay)\n",
    "if ntries > maxretries:\n",
    "    pyautogui.alert(title=\"Problem!\", text=\"Max retries reached; please review last few downloads.\")\n",
    "else:\n",
    "    pyautogui.alert(title=\"Done!\", text=\"Done!\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
